{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 探索性数据分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import lib\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import auc, roc_auc_score, accuracy_score, confusion_matrix, mean_squared_error\n",
    "# from sklearn.ensemble import GradientBoostingClassifier\n",
    "import xgboost as xgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the training table, its target table and the test table from ../data.\n",
    "X, y, newX = (\n",
    "    pd.read_csv(csv_path)\n",
    "    for csv_path in ('../data/train.csv', '../data/train_target.csv', '../data/test.csv')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 132029 entries, 0 to 132028\n",
      "Columns: 104 entries, id to isNew\n",
      "dtypes: float64(3), int64(101)\n",
      "memory usage: 104.8 MB\n"
     ]
    }
   ],
   "source": [
    "# Print the summary of X: index range, column count, dtypes and memory usage.\n",
    "X.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 132029 entries, 0 to 132028\n",
      "Data columns (total 2 columns):\n",
      "id        132029 non-null int64\n",
      "target    132029 non-null int64\n",
      "dtypes: int64(2)\n",
      "memory usage: 2.0 MB\n"
     ]
    }
   ],
   "source": [
    "# Print the summary of y: index range, per-column non-null counts, dtypes and memory usage.\n",
    "y.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id                      0\n",
       "certId                  0\n",
       "loanProduct             0\n",
       "gender                  0\n",
       "age                     0\n",
       "dist                    0\n",
       "edu                     0\n",
       "job                     0\n",
       "lmt                     0\n",
       "basicLevel              0\n",
       "x_0                     0\n",
       "x_1                     0\n",
       "x_2                     0\n",
       "x_3                     0\n",
       "x_4                     0\n",
       "x_5                     0\n",
       "x_6                     0\n",
       "x_7                     0\n",
       "x_8                     0\n",
       "x_9                     0\n",
       "x_10                    0\n",
       "x_11                    0\n",
       "x_12                    0\n",
       "x_13                    0\n",
       "x_14                    0\n",
       "x_15                    0\n",
       "x_16                    0\n",
       "x_17                    0\n",
       "x_18                    0\n",
       "x_19                    0\n",
       "                    ...  \n",
       "x_64                    0\n",
       "x_65                    0\n",
       "x_66                    0\n",
       "x_67                    0\n",
       "x_68                    0\n",
       "x_69                    0\n",
       "x_70                    0\n",
       "x_71                    0\n",
       "x_72                    0\n",
       "x_73                    0\n",
       "x_74                    0\n",
       "x_75                    0\n",
       "x_76                    0\n",
       "x_77                    0\n",
       "x_78                    0\n",
       "certValidBegin          0\n",
       "certValidStop           0\n",
       "bankCard            20152\n",
       "ethnic                  0\n",
       "residentAddr            0\n",
       "highestEdu              0\n",
       "linkRela                0\n",
       "setupHour               0\n",
       "weekday                 0\n",
       "ncloseCreditCard        0\n",
       "unpayIndvLoan           0\n",
       "unpayOtherLoan          0\n",
       "unpayNormalLoan         0\n",
       "5yearBadloan            0\n",
       "isNew                   0\n",
       "Length: 104, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of missing entries in each column of X.\n",
    "X.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the columns that are not used as model features and keep only the\n",
    "# 'target' column of the label table.\n",
    "# This cell rebinds X and y from themselves, so it is written to be safe to\n",
    "# re-run: already-dropped columns are ignored and y is only indexed while it\n",
    "# is still a DataFrame.\n",
    "# NOTE(review): X and y are paired by row position here; this assumes both\n",
    "# CSV files list the ids in the same order -- confirm.\n",
    "X = X.drop(columns=['id', 'certId', 'bankCard', 'isNew'], errors='ignore')\n",
    "if isinstance(y, pd.DataFrame):\n",
    "    y = y['target']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 33% of the rows for evaluation; random_state pins the shuffle.\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
       "              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',\n",
       "              gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,\n",
       "              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,\n",
       "              nthread=None, objective='binary:logistic', random_state=42,\n",
       "              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
       "              silent=None, subsample=1, verbosity=1)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit an XGBoost classifier with a logistic objective.\n",
    "# The held-out split is passed as eval_set so AUC is computed on it during\n",
    "# training; no early stopping is requested, so it is monitoring only.\n",
    "xgb_model = xgb.XGBClassifier(\n",
    "    objective=\"binary:logistic\",\n",
    "    eval_metric=\"auc\",\n",
    "    random_state=42,\n",
    ")\n",
    "xgb_model.fit(\n",
    "    X_train,\n",
    "    y_train,\n",
    "    eval_set=[(X_test, y_test)],\n",
    "    verbose=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7044865180005394"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Take the second predict_proba column as the score for each held-out row,\n",
    "# then report the ROC AUC of those scores against the true labels.\n",
    "y_pred = xgb_model.predict_proba(X_test)[:, 1]\n",
    "roc_auc_score(y_true=y_test, y_score=y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "file_extension": ".py",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "mimetype": "text/x-python",
  "name": "python",
  "npconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "toc": {
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "version": 3
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
